Load Data

library(ggplot2)
library(data.table)
library(scales)
library(RColorBrewer)
library(reticulate)
library(uwot)
## Loading required package: Matrix
library(gridExtra)
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
## Loading required package: rpart
## 
## spatstat 1.63-3       (nickname: 'Wet paint') 
## For an introduction to spatstat, type 'beginner'
## 
## Attaching package: 'spatstat'
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:data.table':
## 
##     shift
use_condaenv(condaenv = 'r-reticulate', required = TRUE)

data_dir <- here::here('..','data')
load(file.path(data_dir, 'diff.sampled.Rdata'))
diff_dt <- fread(file.path(data_dir, 'diff.sampled.csv.gz'))

do_umap_param_search = T
censor_negative_min = -1500

# map day 0 to both plus and minus
map_day_0 <- function(df) {
  return(rbind(
    df[k562!='none'],
    df[k562=='none'][, k562 := 'cd19+'],
    df[k562=='none'][, k562 := 'cd19-']
  ))
}

UMAP

For now, skip straight to high dimensional plotting.

UMAP Functions

sample_for_umap <- function(df, sample_size) {
  umap_dt <- diff_dt[!is.na(event_id), 
    .SD[event_id %in% sample(unique(event_id), 
      min(sample_size, length(unique(event_id))))],
    by=c('well', 'plate', 'day')]
}

cast_for_umap <- function(df) {
  #cast it so variables are columns and
  #subset sample umap data on variables
  umap_cast_dt <- dcast(df, 
    event_id + well + plate + day + t_type ~ variable, 
    value.var='value')
  return(umap_cast_dt)
}

scale_for_umap <- function(df, umap_vars, censor_negative_min) {
  umap_dt_in <- df[, ..umap_vars]
  
  # for points that are very negative, trim the to below the cutoff
  umap_dt_in[umap_dt_in < censor_negative_min] <- censor_negative_min
  
  # scale each input column
  umap_dt_in[ , 
    (names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]
  
  return(umap_dt_in)
}
  
# check scales:
# ggplot(melt(cbind(umap_dt_in, id=1:nrow(umap_dt_in)), id.var='id'), 
#        aes(x=value)) +
#     geom_density() +
#     facet_grid(variable~.) +
#     scale_fill_distiller(palette='RdYlBu') +
#     theme_bw()

Choose UMAP Params

First, find parameters with a small sample.

#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map

# for now use all variables in the channel map except cd4/8
umap_vars <- c(paste0(names(channel_map),'_t'))
umap_vars <- umap_vars[umap_vars != 'cd_t']

# use umap canned functions
umap_dt <- sample_for_umap(diff_dt, 20)
umap_cast_dt <- cast_for_umap(umap_dt)
umap_dt_in <- scale_for_umap(umap_cast_dt, umap_vars, censor_negative_min)

### Run across parameter list

#umap parameter list
min_dist_set <- c(0.0001, 0.005, 0.1)
n_neighbor_set <- c(3,6,15,30)
umap_params <- data.table(expand.grid(min_dist_set, n_neighbor_set))

# run umap via uwot library
umap_out <- umap_params[, {
  print(paste0('Running: ',.BY)); 
  as.data.table(umap(umap_dt_in, min_dist=as.numeric(.BY[1]),
    n_neighbors=as.numeric(.BY[2]), verbose=F, n_threads=8, n_trees=50))
  }, by=names(umap_params)]
## [1] "Running: 1e-04" "Running: 3"    
## [1] "Running: 0.005" "Running: 3"    
## [1] "Running: 0.1" "Running: 3"  
## [1] "Running: 1e-04" "Running: 6"    
## [1] "Running: 0.005" "Running: 6"    
## [1] "Running: 0.1" "Running: 6"  
## [1] "Running: 1e-04" "Running: 15"   
## [1] "Running: 0.005" "Running: 15"   
## [1] "Running: 0.1" "Running: 15" 
## [1] "Running: 1e-04" "Running: 30"   
## [1] "Running: 0.005" "Running: 30"   
## [1] "Running: 0.1" "Running: 30"
# rename the columns
names(umap_out) <- c('min_dist','n_neighbor','umap_1','umap_2')

# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]

umap_fcs_dt <- unique(umap_dt[, 
  .(donor, car, k562, t_type, day, well, event_id)])[
    umap_fcs_dt, on=.(t_type,well,day,event_id)]
color_points <- ggplot(umap_fcs_dt[car %in% c('CD28','41BB','KLRG1','BAFF-R','Zeta')], 
    aes(x=umap_1, y=umap_2, color=interaction(car, t_type))) +
  geom_point(size=0.1, alpha=0.1) + 
  guides(colour = guide_legend(override.aes = list(alpha=1, size=3))) +
  facet_grid(min_dist~n_neighbor) +
  theme_bw()

color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
  geom_hex(bins = 70) +
  scale_fill_continuous(
    type = "viridis", limits=c(0,30), oob=scales::squish) +
  facet_grid(min_dist~n_neighbor) +
  theme_bw()

grid.arrange(color_points, color_density, ncol=2)

Choosing neighbors == 15 and min_dist == 1e-4.

Checking UMAP Param space

Scaling this to 200 events per well and checking CAR/condition separation.

chosen_dist=0.005
chosen_n_neighbor=15

umap_dt <- diff_dt[!is.na(event_id), 
  .SD[event_id %in% sample(unique(event_id), 
    min(20, length(unique(event_id))))],
  by=c('well', 'plate', 'day')]

#grab channel map from cd4 data (cd4/8 are the same)
channel_map <- diff_opt_cd4$channel_map

# for now use all variables in the channel map
umap_vars <- c(paste0(names(channel_map),'_t'))

#cast it so variables are columns and
#subset sample umap data on variables
umap_cast_dt <- dcast(umap_dt, 
  event_id + well + plate + day + t_type ~ variable, 
  value.var='value')

umap_dt_in <- umap_cast_dt[, ..umap_vars]

# scale each input column
umap_dt_in[ , 
  (names(umap_dt_in)) := lapply(.SD, scale), .SDcols = names(umap_dt_in)]

# run umap via uwot library
umap_out <- umap(umap_dt_in, min_dist=chosen_dist,
    n_neighbors=chosen_n_neighbor, verbose=T, n_threads=8, n_trees=50,
    ret_nn=T)
## 19:45:44 UMAP embedding parameters a = 1.914 b = 0.7956
## 19:45:44 Read 14011 rows and found 10 numeric columns
## 19:45:44 Using Annoy for neighbor search, n_neighbors = 15
## 19:45:44 Building Annoy index with metric = euclidean, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 19:45:47 Writing NN index file to temp file /tmp/RtmpSOnjFQ/file143878f754e0
## 19:45:47 Searching Annoy index using 8 threads, search_k = 1500
## 19:45:48 Annoy recall = 100%
## 19:45:48 Commencing smooth kNN distance calibration using 8 threads
## 19:45:49 Initializing from normalized Laplacian + noise
## 19:45:49 Commencing optimization for 200 epochs, with 279118 positive edges
## 19:46:00 Optimization finished
# nearest neighbors in original space
nearest_neighbors <- umap_out$nn$euclidean$idx
neighbor_dist <- umap_out$nn$euclidean$dist
edge_weights <- scales::rescale(-neighbor_dist, to=c(0,1))
edge_weights <- edge_weights - min(edge_weights)
umap_out <- data.table(umap_out$embedding)

#nearest neighbors in embedding
umap_nn <- cbind(1:nrow(umap_fcs_dt), 
    nnwhich(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_nd <- cbind(rep(0, nrow(umap_fcs_dt)), 
    nndist(umap_fcs_dt[, list(umap_1, umap_2)], k=c(1:3)))
umap_ew <- scales::rescale(-umap_nd, to=c(0,1))
umap_ew <- umap_ew - min(umap_ew)

    
# rename the columns
names(umap_out) <- c('umap_1','umap_2')

# add the umap output to the input dt
umap_fcs_dt <- cbind(umap_cast_dt[, 1:length(names(umap_cast_dt))], umap_out)
umap_fcs_dt[, id := 1:.N]

# add back the annotations
umap_fcs_dt <- unique(umap_dt[, 
  .(donor, car, k562, t_type, day, well, event_id)])[
    umap_fcs_dt, on=.(t_type,well,day,event_id)]
cd4_colors = brewer.pal('Greens', n=9)[c(2,4,6,8,9)]
cd8_colors = brewer.pal('Oranges', n=9)[c(2,4,6,8,9)]

color_time <- ggplot(umap_fcs_dt[][, cd19 := (k562=='cd19+')], 
    aes(x=umap_1, y=umap_2, color=interaction(day, t_type))) +
  geom_point(size=0.1, alpha=0.5) +
  facet_grid(car~cd19) +
  scale_color_manual(values=c(cd4_colors, cd8_colors)) +
  guides(colour = guide_legend(override.aes = list(alpha=1, size=3)))

color_density <- ggplot(umap_fcs_dt, aes(x=umap_1, y=umap_2)) +
  geom_hex(bins = 70) +
  facet_grid(car~cd19) +
  scale_fill_continuous(
    type = "viridis", limits=c(0,30), oob=scales::squish) +
  theme_bw()

color_markers <- ggplot(
    melt(umap_fcs_dt, measure.vars=umap_vars)[, 
      value.scaled := scale(value), by='variable'][,
        cd19 := (k562=='cd19+')], 
    aes(x=umap_1, y=umap_2, z=value.scaled)) +
  stat_summary_hex(bins = 70) +
  facet_grid(variable~cd19) +
  scale_fill_distiller(palette='RdYlBu', limits=c(-3, 3), oob=scales::squish) +
  theme_bw()

grid.arrange(color_time, color_density, color_markers, ncol=3)

Clustering via leiden/reticulate

import leidenalg
import igraph as ig
import pandas as pd
import numpy as np
import math

edges = []
n_neighbors = r.nearest_neighbors.shape[1]
for i, nn_row in enumerate(r.nearest_neighbors):
    edge_weights = r.edge_weights[i]
    for j in range(1, n_neighbors):
        edges.append((int(nn_row[0]), int(nn_row[j]), edge_weights[j]))
      
knn_graph = ig.Graph.TupleList(edges, directed=True, weights=True)

part = leidenalg.find_partition(knn_graph, 
    leidenalg.ModularityVertexPartition, weights='weight')
#part = leidenalg.find_partition(knn_graph, 
#    leidenalg.CPMVertexPartition, weights='weight', 
#    resolution_parameter=0.05)

cluster_membership = [x for _,x in sorted(zip(knn_graph.vs['name'],part.membership))]
umap_fcs_dt[, cluster := factor(py$cluster_membership+1)]

umap_cluster_dt <- umap_fcs_dt[, list(
  mean_umap_1= mean(umap_1), 
  mean_umap_2= mean(umap_2), 
  size= .N), by=cluster]

ggplot() + 
  geom_point(data=umap_fcs_dt, 
    aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
  geom_label(data=umap_cluster_dt, aes(
    x=mean_umap_1, y=mean_umap_2,
    label=paste(cluster,size,sep='\n'), 
    color=cluster), alpha=0.3) +
  theme_minimal()

ggplot() + 
  geom_point(data=umap_fcs_dt, 
    aes(x=umap_1, y=umap_2, color=cluster), size=0.2, alpha=0.2) +
  geom_label(data=umap_cluster_dt, aes(
    x=mean_umap_1, y=mean_umap_2,
    label=paste(cluster,size,sep='\n'), 
    color=cluster), alpha=0.3) +
  facet_grid(~cluster) +
  theme_minimal()